# Load the `mnist` data set and assemble one data frame for the analysis:
# the `Xtrain` entry becomes the feature columns and the `Ytrain` entry is
# appended as the factor column `Group`.
data("mnist")
mnist_X <- data.frame(mnist[["Xtrain"]])
mnist_y <- mnist[["Ytrain"]]
mnist_X[["Group"]] <- as.factor(mnist_y)

Basic Targets

Following the application of three dimensionality reduction methods (PCA, t-SNE, UMAP) last time, we want to further investigate the different hyperparameters of the last two methods (the first one is a linear method with no hyperparameters to specify).

Hyperparameter-Tuning for t-SNE

Once again, we start with the default parameters on the MNIST-6000 dataset.

# Fit a two-dimensional t-SNE embedding of `mnist_X` and return the object
# produced by Rtsne().
#
# Arguments:
#   perpl      - passed to Rtsne() as `perplexity`.
#   iterations - passed to Rtsne() as `max_iter`.
#   learning   - passed to Rtsne() as `eta`.
#
# Column 785 of `mnist_X` is excluded from the input
# (presumably the `Group` label column - TODO confirm).
tsne.plot <- function(perpl=30, iterations=500, learning=200){
  features <- mnist_X[, -785]
  Rtsne(
    features,
    dims = 2,
    perplexity = perpl,
    verbose = TRUE,
    max_iter = iterations,
    eta = learning,
    partial_pca = TRUE
  )
}

# Fit t-SNE twice with identical (default) parameters so the two embeddings
# can be compared side by side.
tsne_default1 <- tsne.plot()
tsne_default2 <- tsne.plot()

# Scatter plot of the first run, coloured by the `Group` column.
# The y aesthetic is mapped directly to the second embedding column; the
# earlier `(x = ...)` form performed a stray assignment inside aes().
default_plot_a <- ggplot() +
  geom_point(
    aes(
      x = tsne_default1$Y[, 1],
      y = tsne_default1$Y[, 2],
      color = mnist_X$Group
    ),
    size = 1
  ) +
  labs(title = "Default TSNE 1", x = "t-SNE 1", y = "t-SNE 2", color = "colors") +
  coord_fixed()

# Scatter plot of the second run, built the same way.
default_plot_b <- ggplot() +
  geom_point(
    aes(
      x = tsne_default2$Y[, 1],
      y = tsne_default2$Y[, 2],
      color = mnist_X$Group
    ),
    size = 1
  ) +
  labs(title = "Default TSNE 2", x = "t-SNE 1", y = "t-SNE 2", color = "colors") +
  coord_fixed()

# Arrange both plots in one row and write them to disk.
ggsave(
  "default_tsne.png",
  ggarrange(default_plot_a, default_plot_b, ncol = 2, nrow = 1)
)

default_tsne

Without fixing the seed of R's random number generator, running the t-SNE algorithm twice with the same set of parameters gives two starkly different graphs. The graph above also displays a key property of t-SNE: while it does a good job of preserving local distances from the high-dimensional space (points close to each other), it fails to maintain distances between relatively distant points (e.g. the point clouds for digit 3 in green and digit 0 in red).

# Perplexity values to compare; all other t-SNE parameters keep the defaults
# of tsne.plot().
perps <- c(2, 5, 10, 30, 50, 100)

# One t-SNE fit per perplexity value, in the same order as `perps`.
tsne_results <- lapply(perps, function(perp) tsne.plot(perpl = perp))

# One scatter plot per fit, coloured by the `Group` column. The embedding is
# copied into a data frame first so each plot carries its own data.
tsne_plots <- lapply(tsne_results, function(fit) {
  embedding <- data.frame(
    x = fit$Y[, 1],
    y = fit$Y[, 2],
    group = mnist_X$Group
  )
  ggplot(embedding, aes(x = x, y = y, color = group)) +
    geom_point(size = 0.5) +
    labs(x = "t-SNE 1", y = "t-SNE 2", color = "colors") +
    coord_fixed()
})

# Arrange the six plots in a 3 x 2 grid, labelled by perplexity, and save.
ggsave(
  "tsne-perplexity.png",
  ggarrange(plotlist = tsne_plots, ncol = 2, nrow = 3, labels = perps)
)
tsne-initial-pca

tsne-initial-pca

Secondly, we observe that most t-SNE implementations first perform PCA on the data to shrink the dimension before continuing with the dimensionality reduction. In our case the default PCA dimension is 50, so we can slightly change this value.

# Compare t-SNE embeddings for different numbers of retained PCA dimensions
# (`initial_dims`), with all other parameters held fixed.
#
# The seed is set immediately before each Rtsne() call so the three fits
# start from the same random state and differ only in `initial_dims`.
# Previously the seed was set before the ggplot() calls, where it had no
# effect on the embeddings.
set.seed(6324)
pca_20 <- Rtsne(
  mnist_X[, -785],
  dims = 2, perplexity = 10, max_iter = 500, eta = 200,
  partial_pca = TRUE, initial_dims = 20
)

# `initial_dims` is left at the Rtsne() default here (labelled "50" below).
set.seed(6324)
pca_50 <- Rtsne(
  mnist_X[, -785],
  dims = 2, perplexity = 10, max_iter = 500, eta = 200,
  partial_pca = TRUE
)

set.seed(6324)
pca_100 <- Rtsne(
  mnist_X[, -785],
  dims = 2, perplexity = 10, max_iter = 500, eta = 200,
  partial_pca = TRUE, initial_dims = 100
)

# One scatter plot per fit, coloured by the `Group` column.
p1 <- ggplot() +
  geom_point(
    aes(x = pca_20$Y[, 1], y = pca_20$Y[, 2], color = mnist_X$Group),
    size = 0.5
  ) +
  labs(x = "t-SNE 1", y = "t-SNE 2", color = "colors") +
  coord_fixed()

p2 <- ggplot() +
  geom_point(
    aes(x = pca_50$Y[, 1], y = pca_50$Y[, 2], color = mnist_X$Group),
    size = 0.5
  ) +
  labs(x = "t-SNE 1", y = "t-SNE 2", color = "colors") +
  coord_fixed()

p3 <- ggplot() +
  geom_point(
    aes(x = pca_100$Y[, 1], y = pca_100$Y[, 2], color = mnist_X$Group),
    size = 0.5
  ) +
  labs(x = "t-SNE 1", y = "t-SNE 2", color = "colors") +
  coord_fixed()

# NOTE(review): the file name is spelled "inital"; it is kept unchanged
# because other documents may reference the image by this name.
ggsave(
  "tsne-inital-pca.png",
  ggarrange(p1, p2, p3, ncol = 2, nrow = 2, labels = c("20", "50", "100"))
)

tsne-initial-pca

## Hyperparameter Tuning for UMAP

According to the documentation:

> min_dist: numeric; determines how close points appear in the final layout
>
> spread: numeric; used during automatic estimation of a/b parameters.

Once again, we start by having a look at how well UMAP can preserve local/nonlocal distances in comparison to t-SNE.

# Run UMAP on `mnist_X` with a copy of `umap.defaults` in which three
# settings are overridden, and return the result of umap().
#
# Arguments:
#   spread    - stored in the configuration as `spread`.
#   min_dist  - stored in the configuration as `min_dist`.
#   rand_seed - stored in the configuration as `random_state`.
#
# Column 785 of `mnist_X` is excluded from the input
# (presumably the `Group` label column - TODO confirm).
tune_umap <- function(spread=1, min_dist=0.1, rand_seed=2020) {
  settings <- umap.defaults
  settings[["spread"]] <- spread
  settings[["min_dist"]] <- min_dist
  settings[["random_state"]] <- rand_seed
  umap(mnist_X[, -785], settings)
}

# Two UMAP runs with default `spread`/`min_dist` that differ only in the
# random seed, for a side-by-side comparison.
umap_default_a <- tune_umap(rand_seed = 2018)
umap_default_b <- tune_umap(rand_seed = 1918)

# One scatter plot per layout, coloured by the `Group` column.
# The y aesthetic is mapped directly to the second layout column; the
# earlier `(x = ...)` form performed a stray assignment inside aes().
p1 <- ggplot() +
  geom_point(
    aes(
      x = umap_default_a$layout[, 1],
      y = umap_default_a$layout[, 2],
      color = mnist_X$Group
    ),
    size = 0.5
  ) +
  labs(x = "UMAP 1", y = "UMAP 2", color = "colors") +
  coord_fixed()

p2 <- ggplot() +
  geom_point(
    aes(
      x = umap_default_b$layout[, 1],
      y = umap_default_b$layout[, 2],
      color = mnist_X$Group
    ),
    size = 0.5
  ) +
  labs(x = "UMAP 1", y = "UMAP 2", color = "colors") +
  coord_fixed()

# Arrange both plots in one row and write them to disk.
ggsave("umap-default.png", ggarrange(p1, p2, ncol = 2, nrow = 1))

umap-default

Tuning min_dist and spread:

# `min_dist` values to compare; `spread` is held at 2 for every run.
dists <- c(0.0001, 0.001, 0.1, 0.5, 1, 1.5)

# One UMAP fit per `min_dist` value, in the same order as `dists`.
umap_results <- lapply(dists, function(d) tune_umap(min_dist = d, spread = 2))

# One scatter plot per layout, coloured by the `Group` column. The axes are
# labelled "UMAP" because these are UMAP layouts; the earlier "t-SNE" labels
# were carried over from the t-SNE section.
umap_plots <- lapply(umap_results, function(fit) {
  embedding <- data.frame(
    x = fit$layout[, 1],
    y = fit$layout[, 2],
    group = mnist_X$Group
  )
  ggplot(embedding, aes(x = x, y = y, color = group)) +
    geom_point(size = 0.5) +
    labs(x = "UMAP 1", y = "UMAP 2", color = "colors") +
    coord_fixed()
})

# Arrange the six plots in a 3 x 2 grid, labelled by `min_dist`, and save.
ggsave(
  "umap-mindists.png",
  ggarrange(plotlist = umap_plots, ncol = 2, nrow = 3, labels = dists)
)
umap-mindists

umap-mindists

# `spread` values to compare; `min_dist` is held at 0.1 for every run.
spreads <- c(0.11, 0.2, 0.5, 1, 2, 5)

# One UMAP fit per `spread` value, in the same order as `spreads`.
umap_results <- lapply(spreads, function(s) tune_umap(spread = s, min_dist = 0.1))

# One scatter plot per layout, coloured by the `Group` column. The axes are
# labelled "UMAP" because these are UMAP layouts; the earlier "t-SNE" labels
# were carried over from the t-SNE section.
umap_plots <- lapply(umap_results, function(fit) {
  embedding <- data.frame(
    x = fit$layout[, 1],
    y = fit$layout[, 2],
    group = mnist_X$Group
  )
  ggplot(embedding, aes(x = x, y = y, color = group)) +
    geom_point(size = 0.5) +
    labs(x = "UMAP 1", y = "UMAP 2", color = "colors") +
    coord_fixed()
})

# Arrange the six plots in a 3 x 2 grid, labelled by `spread`, and save.
ggsave(
  "umap-spreads.png",
  ggarrange(plotlist = umap_plots, ncol = 2, nrow = 3, labels = spreads)
)
umap-spreads

umap-spreads